*************
* Prep geography movers ;

options compress=YES;

	proc datasets lib=work kill nolist memtype=data;
	libname eer '';
	libname interm '';

***
** MARK MSA OF EACH COUNTY;
* Crosswalk from NBER, http://www.nber.org/data/cbsa-msa-fips-ssa-county-crosswalk.html;
* Also merge CZs from David Dorn, https://www.ddorn.net/data.htm;

	proc import datafile = "cbsa_msa_xwalk.csv" dbms = csv out = raw_msa replace; guessingrows = 300; run;
	proc import datafile = "cw_cty_czone.csv" dbms = csv out = raw_cz replace; guessingrows = 300; run;
	proc sql noprint;
		CREATE TABLE cnty AS SELECT lower(compress(geoname, ' -.')) AS cname, lower(statename) AS state, GEOCODE AS geo
		FROM eer.geocode WHERE geoname ~= "" AND trim(geoname) ~= "zstatewide";

		CREATE TABLE msa AS SELECT lower(compress(countyname, ' -.')) AS cname, lower(state) AS state, fipscounty, msa 
		FROM raw_msa;

		CREATE TABLE temp AS SELECT * FROM cnty A LEFT JOIN msa B ON A.cname = B.cname AND A.state = B.state;

		CREATE TABLE cz AS SELECT cty_fips AS fipscounty, czone AS cz FROM raw_cz;

		CREATE TABLE geolink AS SELECT * FROM temp A LEFT JOIN cz B ON A.fipscounty = B.fipscounty ORDER BY state, fipscounty;


***
** APPEND GEO DATA ACROSS YEARS;

	%MACRO AppendYear(YR);

		%PUT ++ Append &YR.;
	
		** Load data;
		%if &YR. = 1993 %then %do;
			proc sql noprint;
				CREATE TABLE temp AS SELECT idd - 1234567890 AS ssn, min(1000 + input(YOB, 4.)) AS byr, GEOCODE AS geo, &YR. AS year
				FROM eer.eer&YR. (pw = colin) WHERE GEOSRCE = "1" AND 1000 + input(YOB, 4.) >= 1930 AND 1000 + input(YOB, 4.) <= 1950
				GROUP BY idd;
		%end;
		%if &YR. >= 1994 %then %do;     
			proc sql noprint;
				CREATE TABLE temp AS SELECT idd - 1234567890 AS ssn, min(input(YOB, 4.)) AS byr, GEO AS geo, &YR. AS year
				FROM eer.eer&YR. (pw = colin) WHERE GEOSRCE = "1" AND input(YOB, 4.) >= 1930 AND input(YOB, 4.) <= 1950
				GROUP BY idd;
		%end;

		** Merge with MSA/ CZ definitions;
		proc sql noprint;
			CREATE TABLE wmsa AS SELECT * FROM temp A LEFT JOIN geolink B ON A.geo = B.geo;			
	
		** For simplicity, keep lowest county;
		proc sort data = wmsa nodupkey; by ssn fipscounty; run;
		data wmsa; set wmsa; by ssn; if first.ssn then seq = 0; seq + 1; run;
		proc sql noprint; CREATE TABLE wmsa AS SELECT *, max(seq) AS num_fips FROM wmsa GROUP BY ssn;
		data wmsa; set wmsa; by ssn; num_fips = max(seq); run;
		data wmsa (drop = seq); set wmsa; where seq = 1; run;

		** Append;
		%if &YR. = 1993 %then %do; 
			data interm.geo; set wmsa; run;
		%end;
		%if &YR. > 1993 %then %do;
			data interm.geo; set interm.geo wmsa; run;
		%end;

	%MEND;


	%AppendYear(1993); %AppendYear(1994); %AppendYear(1995); %AppendYear(1996); 
	%AppendYear(1997); %AppendYear(1998); %AppendYear(1999); %AppendYear(2000); 
	%AppendYear(2001); %AppendYear(2002); %AppendYear(2003); %AppendYear(2004);
	%AppendYear(2005); %AppendYear(2006); %AppendYear(2007); %AppendYear(2008);
	%AppendYear(2009); %AppendYear(2010);

	data interm.geo ;
		set interm.geo (keep = ssn byr geo year) ;
	run ;

	proc sort data=interm.geo out=interm.geo nodupkey ;
		by ssn year geo ;
	run;

	proc means min max data=interm.geo ;
		var year ;
	run ;

* Within SSN, delete if geo is the same as the previous observation (previous year) ;
* Reshape wide ;

data interm.geo;
	set interm.geo;
	by ssn;

	keep 	ssn 
			byr
			geo1993-geo2010
			;	

	retain 	geo1993-geo2010 ;

	array	ageo(1993:2010)	$5 geo1993-geo2010 ;

	if first.ssn then
	do;
		do i=1993 to 2010;
			ageo(i) = "";
		end;
	end;

	ageo(year) = geo;

	if last.ssn then output;

run;

* Delete observations that repeat previous year ;
data interm.geo ;
	set interm.geo ;

		array geo(1993:2010) geo1993-geo2010 ;
		array ngeo(1993:2010) ngeo1993-ngeo2010 ;

		ngeo(1993) = geo(1993) ;
		do i = 1994 to 2010 ;
			if geo(i) ^= geo(i-1) then ngeo(i) = geo(i) ;
				else ngeo(i) = . ;
		end ;

		drop geo: i ;
run ;

* Reshape long ;
data interm.geo;
	set interm.geo;

	array 	angeo(1993:2010) 	ngeo1993-ngeo2010 ;
	
	do year = 1993 to 2010;
		ngeo = angeo(year);
		output;
	end;

	drop 	ngeo1993-ngeo2010 ;	
run;

data interm.geo;
	set interm.geo
		(where=(geo^=.)
		 rename=(ngeo=geo)
		) ;
run ;

data interm.geo;
	set interm.geo 
	(rename=(ssn=ssn_num));

	ssn = put(ssn_num,z9.);
	drop ssn_num ;
run ;

run; quit;
